import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.figure_factory as ff
from scipy.stats import norm, skew, kurtosis, poisson
# Load the county-level COVID-19 table (one row per county, with per-date
# '_cases' and '_deaths' columns) into a DataFrame.
super_states = pd.read_csv('super_covid_states.csv')
# Bare expression: in a notebook this renders the DataFrame as the cell output.
super_states
| countyFIPS | County Name | State | StateFIPS | population | 2020-06-01_cases | 2020-06-02_cases | 2020-06-03_cases | 2020-06-04_cases | 2020-06-05_cases | ... | 2020-12-25_deaths | 2020-12-26_deaths | 2020-12-27_deaths | 2020-12-28_deaths | 2020-12-29_deaths | 2020-12-30_deaths | 2020-12-31_deaths | 2021-01-01_deaths | 2021-01-02_deaths | 2021-01-03_deaths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2013 | Aleutians East Borough | AK | 2 | 3337 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 2016 | Aleutians West Census Area | AK | 2 | 5634 | 0 | 0 | 0 | 0 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 2020 | Municipality of Anchorage | AK | 2 | 288000 | 232 | 235 | 250 | 258 | 263 | ... | 136 | 137 | 138 | 138 | 139 | 141 | 141 | 142 | 142 | 142 |
| 3 | 2050 | Bethel Census Area | AK | 2 | 18386 | 3 | 3 | 3 | 3 | 3 | ... | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 | 15 |
| 4 | 2060 | Bristol Bay Borough | AK | 2 | 836 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 355 | 45083 | Spartanburg County | SC | 45 | 319785 | 637 | 645 | 650 | 687 | 701 | ... | 360 | 366 | 371 | 371 | 372 | 388 | 392 | 392 | 398 | 405 |
| 356 | 45085 | Sumter County | SC | 45 | 106721 | 387 | 396 | 408 | 414 | 426 | ... | 113 | 114 | 115 | 115 | 115 | 115 | 117 | 117 | 117 | 122 |
| 357 | 45087 | Union County | SC | 45 | 27316 | 37 | 40 | 40 | 40 | 40 | ... | 37 | 39 | 39 | 39 | 39 | 39 | 41 | 41 | 42 | 43 |
| 358 | 45089 | Williamsburg County | SC | 45 | 30368 | 238 | 245 | 246 | 248 | 254 | ... | 60 | 60 | 60 | 60 | 60 | 61 | 61 | 61 | 61 | 63 |
| 359 | 45091 | York County | SC | 45 | 280979 | 417 | 421 | 426 | 433 | 446 | ... | 172 | 178 | 178 | 178 | 178 | 178 | 180 | 180 | 183 | 186 |
360 rows × 439 columns
# Fit a normal distribution to New Jersey's statewide COVID-19 case totals and
# draw two figures: a density histogram overlaid with the fitted curve, and a
# binned probability mass function (PMF).
normal_state_stats = []
nj_data = super_states[super_states['State'] == 'NJ']

# Keep only the case columns (names containing '_cases'); death columns are
# filtered out.
case_columns = [col for col in nj_data.columns if '_cases' in col]
# Summing over the county rows yields one statewide value per date column.
nj_cases = nj_data[case_columns].sum(axis=0)

# norm.fit returns the fitted location (mean) and scale (standard deviation).
mean, std = norm.fit(nj_cases)

# Histogram of the observed values, normalised to a probability density so it
# shares a y-axis with the fitted PDF.
histogram_data = go.Histogram(
    x=nj_cases,
    histnorm='probability density',
    name='Actual Data',
    opacity=0.75,
)

# Evaluate the fitted PDF on an even grid spanning the observed range.
x_values = np.linspace(nj_cases.min(), nj_cases.max(), 100)
pdf_values = norm.pdf(x_values, mean, std)
distribution_plot = go.Scatter(
    x=x_values,
    y=pdf_values,
    mode='lines',
    name='Fitted Normal Distribution',
)

# Binned PMF: use raw bin counts divided by the total so the bar heights are
# probabilities that sum to 1. (density=True would instead return density
# values whose *integral* is 1, which are not probabilities.)
counts, bin_edges = np.histogram(nj_cases, bins=10)
hist = counts / counts.sum()
bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])

fig = go.Figure(data=[histogram_data, distribution_plot])
fig.update_layout(
    title='Fitted Normal Distribution for COVID-19 Cases in New Jersey',
    xaxis_title='Number of Cases',
    yaxis_title='Probability Density',
    showlegend=True,
)
fig.show()

pmf_plot = go.Bar(
    x=bin_centers,
    y=hist,
    name='PMF (Probability Mass Function)',
    opacity=0.75,
)
pmf_fig = go.Figure(data=[pmf_plot])
pmf_fig.update_layout(
    title='Probability Mass Function (PMF) for COVID-19 Cases in New Jersey',
    xaxis_title='Number of Cases',
    yaxis_title='Probability',
    showlegend=True,
)
pmf_fig.show()
# For each selected state: fit a normal distribution to the statewide case
# totals, plot the density histogram with the fitted curve, and record the
# mean, variance, skewness, and kurtosis in a summary table.
states = ['NJ', 'OH', 'AK', 'SC', 'NV', 'GA']
normal_state_stats = []

# The set of case columns is the same for every state, so select it once.
case_columns = [col for col in super_states.columns if '_cases' in col]

for state in states:
    state_data = super_states[super_states['State'] == state]
    # Summing over the county rows yields one statewide value per date column.
    state_cases = state_data[case_columns].sum(axis=0)

    # norm.fit returns the fitted location (mean) and scale (standard deviation).
    mean, std = norm.fit(state_cases)

    histogram_data = go.Histogram(
        x=state_cases,
        histnorm='probability density',
        name=f'Actual Data ({state})',
        opacity=0.75,
    )
    x_values = np.linspace(state_cases.min(), state_cases.max(), 100)
    pdf_values = norm.pdf(x_values, mean, std)
    distribution_plot = go.Scatter(
        x=x_values,
        y=pdf_values,
        mode='lines',
        name=f'Fitted Normal Distribution ({state})',
    )

    fig = go.Figure(data=[histogram_data, distribution_plot])
    fig.update_layout(
        title=f'Fitted Normal Distribution for COVID-19 Cases in {state}',
        xaxis_title='Number of Cases',
        yaxis_title='Probability Density',
        showlegend=True,
    )
    fig.show()

    # Summary statistics for this state.
    normal_state_stats.append({
        'State': state,
        'Cases Mean': mean,
        # Variance is the SQUARE of the fitted standard deviation (std ** 2);
        # the previous std*2 merely doubled the standard deviation.
        'Cases Variance': std ** 2,
        'Cases Skewness': skew(state_cases),
        'Cases Kurtosis': kurtosis(state_cases),
    })

norm_state_stats_df = pd.DataFrame(normal_state_stats)
norm_state_stats_df
| State | Cases Mean | Cases Variance | Cases Skewness | Cases Kurtosis | |
|---|---|---|---|---|---|
| 0 | NJ | 239239.539171 | 183738.566316 | 1.606140 | 1.621851 |
| 1 | OH | 211023.921659 | 371580.244441 | 1.370047 | 0.718562 |
| 2 | AK | 13352.525346 | 28378.773346 | 1.151845 | -0.034628 |
| 3 | SC | 137550.829493 | 157071.262970 | 0.261098 | -0.634143 |
| 4 | NV | 85063.677419 | 115726.700304 | 0.760187 | -0.203006 |
| 5 | GA | 266488.705069 | 276094.513462 | 0.031708 | -0.873437 |
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson
# Per-state Poisson summary. Case and death totals are rescaled to a rate per
# 100,000 residents, and a Poisson distribution is parameterised with the
# average of that rate.
file_path = 'super_covid_states.csv'
covid_data = pd.read_csv(file_path)
states = ['NJ', 'OH', 'AK', 'SC', 'NV', 'GA']
poisson_state_stats = []

for state in states:
    state_df = covid_data.loc[covid_data['State'] == state]

    # Total residents across all of this state's county rows.
    state_population = state_df['population'].sum()

    # Sum over county rows: one statewide value per date column.
    daily_cases = state_df.filter(like='_cases').sum(axis=0)
    daily_deaths = state_df.filter(like='_deaths').sum(axis=0)

    # Rescale each per-date value to a rate per 100,000 residents.
    cases_per_100k = (daily_cases / state_population) * 100000
    deaths_per_100k = (daily_deaths / state_population) * 100000

    # Frozen Poisson distributions whose rate is the average per-100k value.
    cases_dist = poisson(mu=cases_per_100k.mean())
    deaths_dist = poisson(mu=deaths_per_100k.mean())

    # Mean and variance come from the Poisson objects; skewness and kurtosis
    # are computed from the observed per-100k series itself.
    poisson_state_stats.append({
        'State': state,
        'Cases Mean': cases_dist.mean(),
        'Cases Variance': cases_dist.var(),
        'Cases Skewness': skew(cases_per_100k),
        'Cases Kurtosis': kurtosis(cases_per_100k),
        'Deaths Mean': deaths_dist.mean(),
        'Deaths Variance': deaths_dist.var(),
        'Deaths Skewness': skew(deaths_per_100k),
        'Deaths Kurtosis': kurtosis(deaths_per_100k)
    })

poisson_state_stats_df = pd.DataFrame(poisson_state_stats)
poisson_state_stats_df
| State | Cases Mean | Cases Variance | Cases Skewness | Cases Kurtosis | Deaths Mean | Deaths Variance | Deaths Skewness | Deaths Kurtosis | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | NJ | 2693.474686 | 2693.474686 | 1.606140 | 1.621851 | 179.344600 | 179.344600 | -0.829271 | 1.214119 |
| 1 | OH | 1805.305128 | 1805.305128 | 1.370047 | 0.718562 | 40.436703 | 40.436703 | 0.733016 | -0.162389 |
| 2 | AK | 1825.250032 | 1825.250032 | 1.151845 | -0.034628 | 10.205029 | 10.205029 | 1.262519 | 0.537980 |
| 3 | SC | 2671.557004 | 2671.557004 | 0.261098 | -0.634143 | 55.274417 | 55.274417 | -0.192374 | -1.249999 |
| 4 | NV | 2761.667832 | 2761.667832 | 0.760187 | -0.203006 | 46.657072 | 46.657072 | 0.343424 | -0.703444 |
| 5 | GA | 2509.918886 | 2509.918886 | 0.031708 | -0.873437 | 55.724770 | 55.724770 | -0.147432 | -1.421200 |
# Re-display the normal-fit summary table built earlier (presumably so it can
# be read next to the Poisson summary above — confirm intent).
norm_state_stats_df
| State | Cases Mean | Cases Variance | Cases Skewness | Cases Kurtosis | |
|---|---|---|---|---|---|
| 0 | NJ | 239239.539171 | 183738.566316 | 1.606140 | 1.621851 |
| 1 | OH | 211023.921659 | 371580.244441 | 1.370047 | 0.718562 |
| 2 | AK | 13352.525346 | 28378.773346 | 1.151845 | -0.034628 |
| 3 | SC | 137550.829493 | 157071.262970 | 0.261098 | -0.634143 |
| 4 | NV | 85063.677419 | 115726.700304 | 0.760187 | -0.203006 |
| 5 | GA | 266488.705069 | 276094.513462 | 0.031708 | -0.873437 |
# Build the New Jersey slice of the merged COVID/demographic table, keeping
# only case columns inside the study window and moving 'population' next to
# 'StateFIPS'.
super_demographic_data = pd.read_csv('Super_Demographic_data.csv', low_memory=False)
states = ['NJ']
in_selected_states = super_demographic_data['State'].isin(states)
super_demographic = super_demographic_data[in_selected_states].reset_index(drop=True)
super_demographic = super_demographic.rename(
    columns={'County Name_x': 'County Name', 'StateFIPS_cases': 'StateFIPS'}
)

start_date = '2020-06-01'
end_date = '2021-01-03'

# Collect every case/death column whose date prefix (the text before the first
# underscore) falls outside [start_date, end_date]. The comparison is a plain
# string comparison on the date prefix.
columns_drop = []
for label in super_demographic.columns:
    is_covid_column = '_cases' in label or '_deaths' in label
    if is_covid_column and not (start_date <= label.split('_')[0] <= end_date):
        columns_drop.append(label)
super_demographic = super_demographic.drop(columns=columns_drop)

# Discard whatever column is currently last in the frame.
# NOTE(review): positional drop — confirm which column this is meant to remove.
super_demographic = super_demographic.drop(super_demographic.columns[-1], axis=1)

# Remove all remaining death columns so that only case columns are left.
deaths_drop = [label for label in super_demographic.columns if '_deaths' in label]
super_demographic = super_demographic.drop(columns=deaths_drop)

# Reorder: place 'population' immediately after 'StateFIPS'. The insertion
# index is taken before 'population' is removed from the list.
cols = super_demographic.columns.tolist()
insert_at = cols.index('StateFIPS') + 1
population_label = cols.pop(cols.index('population'))
cols.insert(insert_at, population_label)
super_demographic = super_demographic[cols]
super_demographic
| countyFIPS | County Name | County Name | State | StateFIPS | population | 2020-06-01_cases | 2020-06-02_cases | 2020-06-03_cases | 2020-06-04_cases | ... | Percent HISPANIC OR LATINO AND RACE Total population Not Hispanic or Latino Two or more races Two races excluding Some other race, and Three or more races | Percent Margin of Error HISPANIC OR LATINO AND RACE Total population Not Hispanic or Latino Two or more races Two races excluding Some other race, and Three or more races | Percent Total housing units | Percent Margin of Error Total housing units | Percent CITIZEN, VOTING AGE POPULATION Citizen, 18 and over population | Percent Margin of Error CITIZEN, VOTING AGE POPULATION Citizen, 18 and over population | Percent CITIZEN, VOTING AGE POPULATION Citizen, 18 and over population Male | Percent Margin of Error CITIZEN, VOTING AGE POPULATION Citizen, 18 and over population Male | Percent CITIZEN, VOTING AGE POPULATION Citizen, 18 and over population Female | Percent Margin of Error CITIZEN, VOTING AGE POPULATION Citizen, 18 and over population Female | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 34001 | Atlantic County | Atlantic County | NJ | 34 | 263670 | 2222 | 2251 | 2284 | 2286 | ... | 2.8 | 0.4 | (X) | (X) | 191213 | (X) | 47.7 | 0.2 | 52.3 | 0.2 |
| 1 | 34003 | Bergen County | Bergen County | NJ | 34 | 932202 | 18302 | 18333 | 18376 | 18408 | ... | 1.6 | 0.1 | (X) | (X) | 641497 | (X) | 47.9 | 0.2 | 52.1 | 0.2 |
| 2 | 34005 | Burlington County | Burlington County | NJ | 34 | 445349 | 4657 | 4680 | 4700 | 4725 | ... | 3.2 | 0.2 | (X) | (X) | 339097 | (X) | 48.8 | 0.1 | 51.2 | 0.1 |
| 3 | 34007 | Camden County | Camden County | NJ | 34 | 506471 | 6543 | 6603 | 6645 | 6663 | ... | 2.1 | 0.2 | (X) | (X) | 370429 | (X) | 47.2 | 0.1 | 52.8 | 0.1 |
| 4 | 34009 | Cape May County | Cape May County | NJ | 34 | 92039 | 623 | 628 | 629 | 630 | ... | 2.3 | 0.4 | (X) | (X) | 74527 | (X) | 48.5 | 0.2 | 51.5 | 0.2 |
| 5 | 34011 | Cumberland County | Cumberland County | NJ | 34 | 149527 | 2310 | 2345 | 2431 | 2469 | ... | 2.2 | 0.4 | (X) | (X) | 105437 | (X) | 50.6 | 0.3 | 49.4 | 0.3 |
| 6 | 34013 | Essex County | Essex County | NJ | 34 | 798975 | 17695 | 17752 | 17811 | 17912 | ... | 1.6 | 0.2 | (X) | (X) | 519018 | (X) | 46.9 | 0.2 | 53.1 | 0.2 |
| 7 | 34015 | Gloucester County | Gloucester County | NJ | 34 | 291636 | 2245 | 2262 | 2278 | 2290 | ... | 2.1 | 0.4 | (X) | (X) | 222747 | (X) | 47.8 | 0.1 | 52.2 | 0.1 |
| 8 | 34017 | Hudson County | Hudson County | NJ | 34 | 672391 | 18428 | 18455 | 18460 | 18465 | ... | 1.7 | 0.2 | (X) | (X) | 399639 | (X) | 48.5 | 0.3 | 51.5 | 0.3 |
| 9 | 34019 | Hunterdon County | Hunterdon County | NJ | 34 | 124371 | 996 | 999 | 1006 | 1009 | ... | 1.2 | 0.3 | (X) | (X) | 96217 | (X) | 49.4 | 0.2 | 50.6 | 0.2 |
| 10 | 34021 | Mercer County | Mercer County | NJ | 34 | 367430 | 6961 | 7004 | 7033 | 7072 | ... | 1.8 | 0.2 | (X) | (X) | 250756 | (X) | 47.8 | 0.3 | 52.2 | 0.3 |
| 11 | 34023 | Middlesex County | Middlesex County | NJ | 34 | 825062 | 15977 | 16021 | 16066 | 16072 | ... | 1.6 | 0.2 | (X) | (X) | 545287 | (X) | 48.7 | 0.2 | 51.3 | 0.2 |
| 12 | 34025 | Monmouth County | Monmouth County | NJ | 34 | 618795 | 8249 | 8289 | 8332 | 8379 | ... | 1.6 | 0.1 | (X) | (X) | 461782 | (X) | 48.0 | 0.1 | 52.0 | 0.1 |
| 13 | 34027 | Morris County | Morris County | NJ | 34 | 491845 | 6506 | 6512 | 6540 | 6548 | ... | 1.9 | 0.2 | (X) | (X) | 354823 | (X) | 48.2 | 0.2 | 51.8 | 0.2 |
| 14 | 34029 | Ocean County | Ocean County | NJ | 34 | 607186 | 8770 | 8817 | 8853 | 8900 | ... | 1.4 | 0.1 | (X) | (X) | 442257 | (X) | 47.3 | 0.1 | 52.7 | 0.1 |
| 15 | 34031 | Passaic County | Passaic County | NJ | 34 | 501826 | 16200 | 16234 | 16277 | 16311 | ... | 1.0 | 0.1 | (X) | (X) | 318405 | (X) | 47.8 | 0.2 | 52.2 | 0.2 |
| 16 | 34033 | Salem County | Salem County | NJ | 34 | 62385 | 639 | 645 | 653 | 659 | ... | 2.0 | 0.4 | (X) | (X) | 48431 | (X) | 47.9 | 0.2 | 52.1 | 0.2 |
| 17 | 34035 | Somerset County | Somerset County | NJ | 34 | 328934 | 4595 | 4605 | 4619 | 4626 | ... | 1.5 | 0.2 | (X) | (X) | 228462 | (X) | 48.1 | 0.2 | 51.9 | 0.2 |
| 18 | 34037 | Sussex County | Sussex County | NJ | 34 | 140488 | 1114 | 1123 | 1128 | 1130 | ... | 1.5 | 0.2 | (X) | (X) | 110292 | (X) | 49.6 | 0.2 | 50.4 | 0.2 |
| 19 | 34039 | Union County | Union County | NJ | 34 | 556341 | 15858 | 15868 | 15911 | 15953 | ... | 1.5 | 0.2 | (X) | (X) | 354386 | (X) | 47.5 | 0.2 | 52.5 | 0.2 |
| 20 | 34041 | Warren County | Warren County | NJ | 34 | 105267 | 1165 | 1173 | 1182 | 1186 | ... | 1.6 | 0.3 | (X) | (X) | 81678 | (X) | 48.5 | 0.2 | 51.5 | 0.2 |
21 rows × 582 columns
# Summarise New Jersey case counts per capita: for each county row, divide
# every case column by the county population, then take the row-wise mean and
# median of those ratios.
file_path = 'demographic_NJ.csv'
df = pd.read_csv(file_path)
case_columns = [name for name in df.columns if 'cases' in name]

# Cases per capita: divide all case columns by 'population' row by row.
df_normalized = df.copy()
df_normalized[case_columns] = df[case_columns].div(df['population'], axis=0)

# Row-wise (per-county) mean and median over the per-capita case columns.
per_capita = df_normalized[case_columns]
df_normalized['mean_cases_per_capita'] = per_capita.mean(axis=1)
df_normalized['median_cases_per_capita'] = per_capita.median(axis=1)

summary_columns = ['County Name', 'mean_cases_per_capita', 'median_cases_per_capita']
df_summary = df_normalized[summary_columns]
df_summary
| County Name | mean_cases_per_capita | median_cases_per_capita | |
|---|---|---|---|
| 0 | Atlantic County | 0.019406 | 0.014924 |
| 1 | Bergen County | 0.027584 | 0.023693 |
| 2 | Burlington County | 0.019647 | 0.015336 |
| 3 | Camden County | 0.024172 | 0.018969 |
| 4 | Cape May County | 0.012867 | 0.010735 |
| 5 | Cumberland County | 0.027190 | 0.024812 |
| 6 | Essex County | 0.031461 | 0.025970 |
| 7 | Gloucester County | 0.018710 | 0.014477 |
| 8 | Hudson County | 0.035718 | 0.030395 |
| 9 | Hunterdon County | 0.013242 | 0.010252 |
| 10 | Mercer County | 0.027169 | 0.023226 |
| 11 | Middlesex County | 0.027504 | 0.022993 |
| 12 | Monmouth County | 0.022905 | 0.018355 |
| 13 | Morris County | 0.019549 | 0.015649 |
| 14 | Ocean County | 0.025037 | 0.019772 |
| 15 | Passaic County | 0.043814 | 0.037362 |
| 16 | Salem County | 0.019356 | 0.016574 |
| 17 | Somerset County | 0.020284 | 0.017113 |
| 18 | Sussex County | 0.013062 | 0.010172 |
| 19 | Union County | 0.037346 | 0.031216 |
| 20 | Warren County | 0.016776 | 0.013528 |
Do counties with younger populations have a faster spread of COVID-19 cases?
Is there a correlation between the size of a county's different racial groups and its COVID-19 case numbers?
Is there a link between the number of men and women in a county and how many cases there are in the county?